# 2022-12-10 prototype "HTML-oddmuse-to-tiddlywiki-conversion.prl" code
#! /usr/bin/perl
# oddmuse-HTML-to-tiddlywiki-conversion.prl
# ^z - 2003-12-31 2007-07-15 2007-12-25 2022-11-27 2022-12-07 2022-12-10
#
# usage: perl oddmuse-HTML-to-tiddlywiki-conversion.prl indir outdir
#
# take all files in "indir", convert contents, replace '.html' with '.tid' at the end of their names, store results in "outdir"
# indir and outdir must already exist
#
# process:
# mass-export Oddmuse ZhurnalyWiki files as "static" HTML files to directory "indir"
# for each individual HTML file:
# make the first line of each file say title: New-wiki-page-name\n\n
# (where New-wiki-page-name is the file name minus .html with "_" changed to " ")
# delete HTML header stuff down to and including <div class="content browse"><p>
# delete HTML footer stuff starting with </div><div class="wrapper close"> to the end
# delete all </p>
# fix external links like <a class="url http outside" href="URL">label</a>
# fix internal links like <a class="local" href="URL">Wiki Page Name</a>
# do other conversion such as <strong>...</strong> or <em>...</em> as desired (see below)
# replace .html with .tid at the end of each file name
# drag-and-drop into an empty TiddlyWiki to import!
use strict;
use warnings;

# ---------------------------------------------------------------------------
# Main program: convert every non-hidden plain file in "indir" from Oddmuse
# static HTML to a TiddlyWiki .tid file in "outdir".
# ---------------------------------------------------------------------------

@ARGV >= 2
    or die "usage: perl oddmuse-HTML-to-tiddlywiki-conversion.prl indir outdir\n";
my ($indir, $outdir) = @ARGV;

# Collect the page names up front so the directory handle can be released
# before any conversion work starts.  Sorting only makes the progress output
# deterministic; readdir order is otherwise unspecified.
opendir(my $in_dh, $indir)
    or die "couldn't open input directory $indir: $!";
my @pages = sort grep { !/^\./ } readdir $in_dh;
closedir $in_dh;

# The output directory is never listed; opening it is just the "must already
# exist" check promised in the usage notes.
opendir(my $out_dh, $outdir)
    or die "couldn't open output directory $outdir: $!";
closedir $out_dh;

foreach my $page (@pages) {
    my $in_path = "$indir/$page";
    -e $in_path or die "$page didn't exist: $!";
    next unless -f $in_path;    # skip subdirectories and other non-files

    print " $page ... ";

    my $body = html_to_tiddlywiki( read_whole_file($in_path) );

    # Tiddler title: file name without ".html" suffix and with "_" --> " "
    my $title = tiddler_title($page);

    # Output file name: ".html" REPLACED by ".tid" (not "page.html.tid")
    (my $stem = $page) =~ s/\.html$//;
    write_whole_file( "$outdir/$stem.tid", "title: $title\n\n$body" );

    print "\n";
}

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# read_whole_file($path) -> entire file content as one string.
# Dies with "$path: <OS error>" if the file cannot be opened.
sub read_whole_file {
    my ($path) = @_;
    open(my $fh, '<', $path) or die "$path: $!";
    my $text = do { local $/; <$fh> };    # slurp mode, limited to this read
    close($fh);
    $text = '' unless defined $text;
    return $text;
}

# write_whole_file($path, $text): create/overwrite $path with $text.
# Dies on open, write or close failure (buffered write errors show up at close).
sub write_whole_file {
    my ($path, $text) = @_;
    open(my $fh, '>', $path) or die "$path: $!";
    print {$fh} $text        or die "$path: $!";
    close($fh)               or die "$path: $!";
    return;
}

# tiddler_title($file_name) -> title string.
# Strips a trailing ".html" and turns every "_" into a space.
sub tiddler_title {
    my ($file_name) = @_;
    (my $title = $file_name) =~ s/\.html$//;
    $title =~ s/_/ /g;
    return $title;
}

# html_to_tiddlywiki($html) -> TiddlyWiki markup (mostly!).
# Converts one Oddmuse ZhurnalyWiki static-HTML page.  The substitutions are
# applied in a fixed order; keep that order when adding new rules.
sub html_to_tiddlywiki {
    my ($body) = @_;

    # remove prefix HTML up to and including <div class="content browse">
    $body =~ s{\A.*?<div class="content browse">}{}s;
    # remove suffix HTML from </div><div class="wrapper close"> to the end
    $body =~ s{</div><div class="wrapper close">.*\z}{}s;

    # change all <p> to \n\n and remove all </p>
    $body =~ s{<p>}{\n\n}g;
    $body =~ s{</p>}{}g;

    # change external links to TiddlyWiki markup: [[label|URL]]
    $body =~ s{<a class="url http outside" href="(.*?)">(.*?)</a>}{[[$2|$1]]}sg;
    # change local links to TiddlyWiki markup (ASSUME label is proper Tiddler name!)
    $body =~ s{<a class="local".*?>(.*?)</a>}{[[$1]]}sg;

    # BOLD (STRONG) and ITALIC (EM)
    $body =~ s{<strong>(.*?)</strong>}{''$1''}sg;
    $body =~ s{<em>(.*?)</em>}{//$1//}sg;

    # BLOCKQUOTE
    $body =~ s{<blockquote>(.*?)</blockquote>}{\n\n<<<\n$1\n<<<\n}sg;

    # LIST (ASSUME list is all one-level UNORDERED bullets!)
    $body =~ s{<ul>}{\n\n}g;
    $body =~ s{<ol>}{\n\n}g;
    $body =~ s{<li>(.*?)</li>}{\n* $1\n}sg;
    $body =~ s{</ul>}{\n\n}g;
    $body =~ s{</ol>}{\n\n}g;

    # HORIZONTAL RULE: empty line above it and \n after it
    $body =~ s{<hr />}{\n\n---\n}g;

    # HEADER markup: <hN> becomes a blank line plus N exclamation marks
    for my $level (1 .. 6) {
        my $bangs = '!' x $level;
        $body =~ s{<h${level}>}{\n\n$bangs}g;
    }
    $body =~ s{</h[1-6]>}{\n}g;

    # ESCAPED words
    $body =~ s{<code>(.+?)</code>}{``$1``}sg;
    # ESCAPED lines
    $body =~ s{<pre class="real">}{\n\n```\n}g;
    $body =~ s{</pre>}{\n```\n}g;

    return $body;
}

=begin comment

# GET RID OF OLD STUFF some day!!!
# (disabled legacy rules; they used to sit after the HTML conversion above
# and before the title line was prepended)
# convert Oddmuse markup to TiddlyWiki markup, as much as is reasonably possible
# BOLD
$body =~ s/\*\*/\'\'/sg; # BOLD: ** becomes ''
# BLOCKQUOTE
$body =~ s/\"\"\"/\n<<</sg; # BLOCKQUOTE: """ becomes \n<<<
# UNORDERED LIST
$body =~ s/\n\*([^*])/\n\n\*$1/sg; # BULLET LIST: initial * gets a \n above
# ORDERED LIST
$body =~ s/\n\#([^#])/\n\n\#$1/sg; # NUMBER LIST: initial # gets a \n above
# HORIZONTAL RULE
$body =~ s/\n----/\n\n----/sg; # HORIZONTAL RULE: put a blank line above
# HEADERS
$body =~ s/\n======/\n\n!!!!!!/sg; # HEADER 6: change = to ! and put blank line above
$body =~ s/\n=====/\n\n!!!!!/sg; # HEADER 5: change = to ! and put blank line above
$body =~ s/\n====/\n\n!!!!/sg; #HEADER 4: change = to ! and put blank line above
$body =~ s/\n===/\n\n!!!/sg; # HEADER 3: change = to ! and put blank line above
$body =~ s/\n==/\n\n!!/sg; # HEADER 2: change = to ! and put blank line above
$body =~ s/\n=/\n\n!/sg; # HEADER 1: change = to ! and put blank line above
# Reference LINK [URL] (avoid greedy pattern match!)
$body =~ s/[^[]\[(http.+?)]/\[\[\*\|$1]]/sg; # [URL] --> [[*|URL]]
# Labeled URL (avoid greedy pattern match!)
$body =~ s/\[\[(http.+?)\|(.+?)\]\]/[[$2\|$1]]/sg; # LINK [[URL|label]] --> [[label|URL]]
# Inline IMAGE LINK
$body =~ s/(\shttp.+?\.jpg)/[img\[$1]]/sg; # URL.jpg --> [img[URL.jpg]]
$body =~ s/(\shttp.+?\.png)/[img\[$1]]/sg; # URL.png --> [img[URL.png]]
# Displayed IMAGE with LINK
$body =~ s/\[\[image:(http.+?)\|(.*?)\|(http.+?)\]\]/<a href="$3" target="_blank">[img["$2"\|$1]]<\/a>/sg; # [[image:URL-displayed-image|label_text|URL-linked-image]] --> <a href="URL-linked-image" target="_blank">[img["label_text"|URL-displayed-image]]</a>
# TABLES
$body =~ s/^\|(.*?)\n(.*?)\|/\|$1<br>$2\|/sg; # in a table cell \n --> <br>
$body =~ s/^\|(.*?)\n(.*?)\n(.*?)\|/\|$1<br>$2<br>$3\|/sg; # and \n...\n --> <br>...<br>
# (worry later about 3 or more \n in a table cell)
$body =~ s/\|(.*?)\\\\(.*?)\|/\|$1<br>$2\|/sg; # in a table cell \\ --> <br>
$body =~ s/\|(.*?)\\\\(.*?)\\\\(.*?)\|/\|$1<br>$2<br>$3\|/sg; # and \\...\\ --> <br>...<br>
# (worry later about 3 or more \\ in a table cell)
$body =~ s/^(\|.+?\|)$/\n$1/msg; # (note "msg"!) put blank line above every table row
# (worry later about blank lines within tables
# ESCAPED WORDS
$body =~ s/{{{(.+?)}}}/``$1``/sg; # {{{...}}} --> ``...``

=end comment

=cut